


# Load main source of data (hand collected from PDFs)
df2 <- read.csv("Data/Raw Data/CA_schools_final.csv",
               header = TRUE,
               stringsAsFactors = FALSE)

# Remove community college referendums and those without LEAID match
df2 <- df2[df2$Community_college ==0,]
df2 <- df2[!is.na(df2$LEAID),]

# Clean type fields
df2$type <- ifelse(df2$type == "GO Bond","Bond",df2$type)
df2$type <- ifelse(df2$levy_parcel >0,"levy_parcel",df2$type)
df2$type <- ifelse(!is.na(df2$bond_amount),"Bond",df2$type)
df2$type <- ifelse(df2$type == "Initiative","levy_parcel",df2$type)# That one initiative in Orange County is a levy tax
df2 <- df2[df2$type %in% c("Bond","levy_parcel","Property Tax"),]

# Change types of variables
df2$LEAID <- sprintf("%07d",df2$LEAID)
df2$date <- as.Date(df2$date,format = "%m/%d/%Y")


###############
## Combine votes for school districts that span multiple counties
###############
#FYI: school districts that span multiple counties enter the raw database as many times as counties. Because the analysis is at the referendum level (i.e. school disctrict), we combine the votes of such referendums enter the name of the school districts.


# Name of referendum in raw dataset
df2$measure_name <- gsub(" ","",df2$measure_name)

# Create an unique ID to identify a referendum based on School ID, name of measure, and date
df2$ref_id <- paste(df2$LEAID,df2$measure_name,df2$date,sep = "_")

# Aggregate votes based on this unique ID.
a <- aggregate(x = list(vote_for = df2$vote_for,vote_against = df2$vote_against,vote_total = df2$vote_total),
               by = list(ref_id = df2$ref_id),FUN = sum)

# Recreate main dataset (i.e. it will not have duplicated results)
df2 <- df2[,c("date","LEAID","type","threshold","bond_amount","levy_parcel","ref_id")]
df2 <- unique(df2)
df2 <- merge(df2,a)
rm(a)
df2$ref_id <- NULL


# Create a unique ID
df2 <- cbind(RefID =seq(1:nrow(df2)),df2)


df2$perc_yes <- df2$vote_for /df2$vote_total
df2$Pass <- ifelse(df2$perc_yes>df2$threshold,1,0)
df2$year <- as.numeric(substr(df2$date,1,4))
df2$odd_year <- df2$year %% 2

###### 
# Get SOI

##
soi <- read.csv("Data/Raw data/soi_school_districts.csv")
soi$School_District <- sprintf("%07d",soi$School_District)
soi <- soi[substr(soi$School_District,1,2)=="06",]#Keep California

soi$itm_salt_perc <- soi$itm_statetax / soi$nb


soi18 <- soi[soi$year ==2018,]
soi17 <- soi[soi$year ==2017,]

soi18 <- soi18[,c("School_District","itm_prop_perc")]
soi17 <- soi17[,c("School_District","itm_prop_perc")]

colnames(soi18)[2:length(soi18)] <- paste(colnames(soi18)[2:length(soi18)],"18",sep = "_")
colnames(soi17)[2:length(soi17)] <- paste(colnames(soi17)[2:length(soi17)],"17",sep = "_")

change <- merge(soi17,soi18)
rm(soi,soi17,soi18)
change$itm_prop_change <- change$itm_prop_perc_17 - change$itm_prop_perc_18

## Construct matrix of referendums
rdm <- as.data.frame(x = list(LEAID = rep(change$School_District,15)))
rdm$year <- rep(seq(2008,2022),each = length(change$School_District))
rdm$nb_refdum <- 0

# Count the number of referendum for each year and each school district
a <- aggregate(df2$RefID,by = list(year = df2$year,LEAID = df2$LEAID),FUN = length)

rdm <- merge(rdm,a,by= c("LEAID","year"),all.x = TRUE)
rdm$nb_refdum <- ifelse(is.na(rdm$x),0,rdm$x)
rdm$x <- NULL

rdm <- merge(rdm,change,by.x = "LEAID",by.y = "School_District")
rdm$post <- ifelse(rdm$year>=2019,1,0)
rdm$refdum_binary <- ifelse(rdm$nb_refdum>0,1,0)


rm(a,change,df2)
